Abstract
Background Environmental insults that activate the maternal immune system are potent primers of developmental neuropathology and maternal immune activation (MIA) has emerged as a risk factor for neurodevelopmental and psychiatric disorders. Animal models of MIA provide an opportunity to identify molecular pathways that initiate disease processes and lead to neuropathology and behavioral deficits in offspring. MIA-induced behaviors are accompanied by anatomical and neurochemical alterations in adult offspring that parallel those seen in affected human populations.
Methods We performed transcriptional profiling and neuroanatomical characterization in a time course across mouse embryonic cortical development, following MIA via single injection of the viral mimic polyinosinic:polycytidylic acid (polyI:C) at E12.5. Transcriptional changes identified in the cortex of MIA offspring at E17.5 were validated and mapped to cortical neuroanatomy and cell types via protein analysis and immunohistochemistry.
Results MIA induced strong transcriptomic signatures, including induction of genes associated with hypoxia, immune signaling, and angiogenesis. The acute response identified 6h after the MIA insult was followed by changes in proliferation, neuronal and glial differentiation, and cortical lamination that emerged at E14.5 and peaked at E17.5. Decreased numbers of proliferative cell types in germinal zones and alterations in neuronal and glial cell types across cortical lamina were identified in the MIA-exposed cortex.
Conclusions MIA-induced transcriptomic signatures in fetal offspring overlap significantly with perturbations identified in neurodevelopmental disorders (NDDs), and provide novel insights into alterations in molecular and developmental timing processes linking MIA and neuropathology, potentially revealing new targets for development of novel approaches for earlier diagnosis and treatment of these disorders.
# Directory structure
github_dir <- file.path("G:\\Shared drives\\Nord Lab - Computational Projects\\MIA\\eLIFE_Clean_code_for_GitHub")
setwd(github_dir)
# Global R markdown code chunk options
knitr::opts_chunk$set(message=FALSE,
warning = FALSE,
error=FALSE,
echo=TRUE,
cache=TRUE,
fig.width = 7, fig.height = 6,
fig.align = 'left')
# R packages
library(sva)
library(RUVnormalize)
library(pheatmap)
library(edgeR)
library(GenomicFeatures)
library(mclust)
library(parallel)
library(ggplot2)
library(reshape2)
library(dplyr)
library(data.table)
library(DT)
library(RColorBrewer)
library(knitr)
library(ggrepel)
library(gridExtra)
library(grid)
library(plyr)
library(plotly)
library(Hmisc)
# Reads input files
setwd("G:/Shared drives/Nord Lab - Computational Projects/MIA/eLIFE_Clean_code_for_GitHub/")
exp.data <- read.csv("./GEO_submission files/Gene_counts.csv")
rownames(exp.data) <- exp.data$X
exp.data$X <- NULL
sample.info <- read.csv("./GEO_submission files/metadata.csv")
load("./GEO_submission files/exonic.gene.sizes")
sample.info_submission <- read.csv("GEO_submission files/Supplementary Table 5, RNA-seq sample metadata_03_02_2021.csv")
datatable(sample.info_submission,
rownames = FALSE,
filter = list(position = 'top', clear = FALSE, plain = FALSE),
options = list(paging = FALSE, scrollX=T, scrollY="200px", searching = FALSE))
A threshold of 1000 counts of Xist gene was used to identify sex
# Qualifies F or M based on the Xist expression
sex.by.rna <- c(ifelse(exp.data["Xist",]>1000,"F","M")) #
Xist_exp <- as.data.frame(reshape2::melt(exp.data["Xist",]))
Xist_exp <- cbind(Xist_exp, sex.by.rna)
Xist_exp <- arrange(Xist_exp, value)
colnames(Xist_exp) <- c("variable", "counts", "sex.by.rna")
ggplot(Xist_exp, aes(x=sex.by.rna, y=counts, colours=sex.by.rna))+
geom_jitter()+
geom_boxplot(alpha=0.2)+
theme_bw()+
labs(title="Xist read counts", x = "", y = "Read counts")+
theme(plot.title = element_text(size = rel(2), hjust=0.5))+
scale_x_discrete(breaks= c("F", "M"), labels=c("Females", "Males"))
# Calculates RPKM values
# Gene lengths calculated with lapply
gene.lengths <- as.numeric(lapply(1:nrow(exp.data), function(x) FUN= as.numeric(exonic.gene.sizes[rownames(exp.data)[x]])))
rpkm.data <- rpkm(exp.data, gene.length=gene.lengths, log=T, prior.count=.25)
rpkm.data_linear <- rpkm(exp.data, gene.length=gene.lengths, log=F)
rpkm.data_linear_all <- rpkm.data_linear
#write.csv(rpkm.data, file = "GEO_submission #files/rpkm_log2_24015.csv")
#write.csv(rpkm.data_linear, file = "GEO_submission #files/rpkm_linear_24015.csv")
### Removes genes with low expression ###
# Lets plot a histogram of all rpkm values in a dataset.
#dim(rpkm.data) #24015 rows and 74 columns
df <- reshape2::melt(rpkm.data)
df$rpkm_linear <- 2^df$value # This converts the log2 RPKM values back to linear scale
colnames(df) <- c("gene_name", "sample", "rpkm_log2","rpkm_linear")
# Histogram of all rpkm values in a dataset.
ggplot(df, aes(x=rpkm_log2))+
geom_histogram(bins = 1000, color="black")+
theme_bw()+
labs(title="Unfiltered dataset of 24015 genes",
x="log2 RPKM", y = "Read counts")+
theme(plot.title = element_text(hjust = 0.5))
# Setting the threshold for minimum rpkm value in at least 2 samples.
threshold = -2
# Nymber of genes after filtering
#sum(reshape2::melt(rowSums(rpkm.data > threshold) >= 2)$value) # 17051
# Let's filter the dataset and plot the histogram distribution
keep <- as.data.frame(reshape2::melt(rowSums(rpkm.data > threshold) >= 2))
keep$gene_name <- rownames(keep)
keep <- filter(keep, value == "TRUE")$gene_name
# NOTE: The set of included genes reported in the manuscript was determined using expression criteria from an expanded set of samples including a condition that was not used for the manuscript. The difference between the gene counts is 17195 in the manuscript from the larger set of samples, compared to 17051 genes that would pass in the set of samples used in the MIA vs. control comparison. Note that there is no difference among DEG passing FDR < 0.05 using either gene set.
original_exp.data <- read.csv("GEO_submission files/Count_gene_set_from_initial_submission.csv")
# length(keep) # 17051
# length(original_exp.data$gene_name) # 17195
keep <- original_exp.data$gene_name
#
datExpr_test <- as.data.frame(rpkm.data)
datExpr_test$gene_name <- rownames(datExpr_test)
# Filters the "keep" genes
datExpr_test <- filter(datExpr_test, gene_name %in% keep)
# Generates a matrix-type data frame
rownames(datExpr_test) <- datExpr_test$gene_name
datExpr_test$gene_name <- NULL
# Overwrites the original datExpr object.
datExpr <- datExpr_test
# Plots the histogram after filtering
df <- reshape2::melt(datExpr)
df$rpkm_linear <- 2^df$value # This converts the log2 RPKM values back to linear scale
colnames(df) <- c("sample", "rpkm_log2","rpkm_linear")
ggplot(df, aes(x=rpkm_log2))+
geom_histogram(bins = 1000, color="black")+
theme_bw()+
labs(title = "Dataset of 17195 genes \n after excluding genes expressed at a very low level", x="log2 RPKM", y = "Read counts")+
theme(plot.title = element_text(hjust = 0.5))
# Removing low expression genes. Overwrites exp.data object used for DE analysis ###
#dim(exp.data) #exp.data contains counts, datExpr contains rpkms
exp.data$gene_name <- rownames(exp.data)
exp.data <- filter(exp.data, gene_name %in% rownames(datExpr))
rownames(exp.data) <- exp.data$gene_name
exp.data$gene_name <- NULL
# Recalculates rpkm values
# Gene lengths calculated with lapply
gene.lengths <- as.numeric(lapply(1:nrow(exp.data), function(x) FUN= as.numeric(exonic.gene.sizes[rownames(exp.data)[x]])))
# RPKM calculation
rpkm.data <- rpkm(exp.data, gene.length=gene.lengths, log=T, prior.count=.25)
rpkm.data_linear <- rpkm(exp.data, gene.length=gene.lengths, log=F)
#write.csv(rpkm.data, file = "GEO_submission #files/rpkm_log2_17051.csv")
#write.csv(rpkm.data_linear, file = "GEO_submission #files/rpkm_linear_17051.csv")
setwd(github_dir)
group <- ifelse(sample.info[,"Condition"]=="Saline",1,2)
rRNA <- as.numeric(sample.info$rRNA)
sex.by.rna <- factor(sample.info$sex.by.rna)
dpc <- factor(sample.info$DPC)
response <- factor(sample.info$Response)
lane <- factor(sample.info$Lane)
source("Dimensionality reduction plots.r")
grid.arrange(PCA_plot, PCA_plot_sex, ncol = 2)
# 19.5 was used as a numerical representation of the P0 time point.
test.dpc <- as.factor(sample.info$DPC)
test.sex.by.rna <- as.factor(sample.info$sex_by_rna)
test.response <- as.factor(sample.info$Response)
test.rRNA <- sample.info$rRNA
test.group <- as.factor(sample.info$Condition)
test.lane <- as.factor(sample.info$Lane)
test.data <- exp.data
min.cpm.criteria <- 0.1
min.cpm <- min.cpm.criteria
#qCML expression modeling
y <- DGEList(counts=test.data, group=test.group)
keep <- rowSums(cpm(y)>min.cpm) >=2
y <- y[keep, , keep.lib.sizes=FALSE]
## Perform simple exact test on group
y <- calcNormFactors(y)
y <- estimateCommonDisp(y)
y <- estimateTagwiseDisp(y)
test.pseudo <- y$pseudo.counts
pca.results <- prcomp(test.pseudo)
#PCA dataframe does not contain all samples as the exp.data, so I have to find which datapoints are control and polyic now
control.datapoints <- which(sample.info$Condition == "Saline")
polyic.datapoints <- which(sample.info$Condition == "PolyIC")
train.data <- data.frame(PCA.1=pca.results$rotation[control.datapoints,1],
PCA.2=pca.results$rotation[control.datapoints,2],
PCA.3=pca.results$rotation[control.datapoints,3],
PCA.4=pca.results$rotation[control.datapoints,4],
PCA.5=pca.results$rotation[control.datapoints,5],
rRNA=as.numeric(test.rRNA[control.datapoints]),
sex=sex.by.rna[control.datapoints],
lane=test.lane[control.datapoints],
response=test.response[control.datapoints]
)
predict.data <- data.frame(PCA.1=pca.results$rotation[polyic.datapoints,1],
PCA.2=pca.results$rotation[polyic.datapoints,2],
PCA.3=pca.results$rotation[polyic.datapoints,3],
PCA.4=pca.results$rotation[polyic.datapoints,4],
PCA.5=pca.results$rotation[polyic.datapoints,5],
rRNA=as.numeric(test.rRNA[polyic.datapoints]),
sex=sex.by.rna[polyic.datapoints],
lane=test.lane[polyic.datapoints],
response=test.response[polyic.datapoints]
)
lm.model <- lm(as.numeric(as.character(test.dpc[control.datapoints])) ~
PCA.1 + PCA.2 + PCA.3 + PCA.4 + PCA.5 + rRNA + sex + response
, data = train.data
)
# Predicts DPC from the linear model built on train data.
predict.saline <- predict(lm.model, newdata = train.data)
predict.polyic <- predict(lm.model, newdata = predict.data)
# I had some trouble with the original for loop and I replaced it with lapply. I also added the sd values and a plot representing modeled DPC values.
dpc.predict.mean.saline <- lapply(1:4, function(x) FUN=mean(predict.saline[which(as.character(test.dpc[control.datapoints])==unique(test.dpc)[x])]))
dpc.predict.sd.saline <- lapply(1:4, function(x) FUN=sd(predict.saline[which(as.character(test.dpc[control.datapoints])==unique(test.dpc)[x])]))
dpc.predict.mean.polyic <- lapply(1:4, function(x) FUN=mean(predict.polyic[which(as.character(test.dpc[polyic.datapoints])==unique(test.dpc)[x])]))
dpc.predict.sd.polyic <- lapply(1:4, function(x) FUN=sd(predict.polyic[which(as.character(test.dpc[polyic.datapoints])==unique(test.dpc)[x])]))
dpc.predict.mean.saline <- as.numeric(dpc.predict.mean.saline)
dpc.predict.sd.saline <- as.numeric(dpc.predict.sd.saline)
dpc.predict.mean.polyic <- as.numeric(dpc.predict.mean.polyic)
dpc.predict.sd.polyic <- as.numeric(dpc.predict.sd.polyic)
lm.predicted.dpc <- data.frame(dpc.predict.mean.saline, dpc.predict.sd.saline, dpc.predict.mean.polyic, dpc.predict.sd.polyic)
####
df_1 <- data.frame("Actual_DPC" = as.numeric(as.character(test.dpc[control.datapoints])),
"Predicted_DPC" = predict.saline, "Condition" = rep("Saline", length(predict.saline)))
df_2 <- data.frame("Actual_DPC" = as.numeric(as.character(test.dpc[polyic.datapoints])),
"Predicted_DPC" = predict.polyic, "Condition" = rep("Poly(I:C)", length(predict.polyic)))
df_combined <- rbind(df_1, df_2)
mean_data <- group_by(df_combined, Actual_DPC, Condition) %>%
summarise(average_predicted_dpc = mean(Predicted_DPC, na.rm = TRUE),
sd_predicted_dpc = sd(Predicted_DPC, na.rm = TRUE)
)
mean_data <- as.data.frame(mean_data)
j_brew_colors <- brewer.pal(n = 8, name = "Paired")[c(6,2)]
df_combined$Condition <- ifelse(df_combined$Condition == "Saline", "Control", as.character(df_combined$Condition))
mean_data$Condition <- ifelse(mean_data$Condition == "Saline", "Control", as.character(mean_data$Condition))
Fig_4c <- ggplot()+
geom_jitter(data=df_combined, aes(x = Actual_DPC, y = Predicted_DPC, color=Condition), size = 2, height= 0.3, alpha=0.7)+
geom_line(data=mean_data, aes(x = Actual_DPC, y = average_predicted_dpc, color=Condition), size=1.5, alpha=0.7)+
theme_bw()+
scale_color_manual(values=c("Control"=j_brew_colors[2], "Poly(I:C)" = j_brew_colors[1]))+
scale_x_continuous(breaks= c(12.5, 14.5, 17.5, 19.5), labels=c("E12.5", "E14.5", "E17.5", "P0"))+
scale_y_continuous(breaks= c(12.5, 14.5, 17.5, 19.5), labels=c("E12.5", "E14.5", "E17.5", "P0"))+
labs(y = "Predicted DPC", x = "Actual DPC")+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
Fig_4c
#Nicer dataframe
predicted.dpc.df <- data.frame("Real_DPC"=c(12.5, 14.5, 17.5, 19.5), "Predicted_Saline_Mean_DPC"= dpc.predict.mean.saline, "Predicted_Saline_SD_DPC"=dpc.predict.sd.saline, "Predicted_PolyIC_Mean_DPC"= dpc.predict.mean.polyic, "Predicted_PolyIC_SD_DPC"=dpc.predict.sd.polyic)
##### T test ####
t.test.12 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="12.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="12.5")])
t.test.14 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="14.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="14.5")])
t.test.17 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="17.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="17.5")])
t.test.19 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="19.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="19.5")])
t.test.modeled.DPC.p.val <- data.frame("p-value E12.5"=t.test.12$p.value, "p-value E14.5"=t.test.14$p.value, "p-value E17.5"=t.test.17$p.value, "p-value E19.5"=t.test.19$p.value)
predicted_DPC_df <- reshape2::melt(data.frame("mean E12.5"=t.test.12$estimate, "mean E14.5"=t.test.14$estimate, "mean E17.5"=t.test.17$estimate, "mean E19.5"=t.test.19$estimate))
t_test_p_value=c(t.test.12$p.value, t.test.14$p.value, t.test.17$p.value, t.test.19$p.value)
specify_decimal <- function(x, k) trimws(format(round(x, k), nsmall=k))
predicted_DPC_df_nice <- data.frame("Real_DPC"= c(12.5, 14.5, 17.5, 19.5), "Predicted_DPC_Saline"=predicted_DPC_df$value[c(1,3,5,7)], "Predicted_DPC_PolyIC"=predicted_DPC_df$value[c(2,4,6,8)], "t_test_p_value"=specify_decimal(t_test_p_value, 8))
knitr::kable(predicted_DPC_df_nice)
| Real_DPC | Predicted_DPC_Saline | Predicted_DPC_PolyIC | t_test_p_value |
|---|---|---|---|
| 12.5 | 12.95995 | 12.95008 | 0.98141714 |
| 14.5 | 14.63756 | 15.34472 | 0.12438056 |
| 17.5 | 16.68691 | 19.01691 | 0.00000121 |
| 19.5 | 19.48706 | 19.27017 | 0.40513411 |
single_timepoint_glm_function<- function(x){
control.datapoints <- intersect(control.datapoints, which(dpc == x))
polyic.datapoints <- intersect(polyic.datapoints, which(dpc == x))
use.cols <- c(control.datapoints, polyic.datapoints)
test.dpc <- dpc[use.cols]
test.sex.by.rna <- sex.by.rna[use.cols]
test.response <- response[use.cols]
test.rRNA <- as.numeric(rRNA)[use.cols]
test.group <- group[use.cols]
test.lane <- as.numeric(lane)[use.cols]
test.data <- exp.data[,use.cols]
min.cpm.criteria <- 0.1
min.cpm <- min.cpm.criteria
design <- model.matrix(~as.factor(test.lane) + as.factor(test.sex.by.rna) + as.numeric(test.group))
y <- DGEList(counts=test.data, group=group[use.cols])
keep <- rowSums(cpm(y)>min.cpm) >=2 #keeps only genes expressed in above min.cpm in at least 2 libraries in each group
y <- y[keep, , keep.lib.sizes=FALSE]
y <- estimateGLMCommonDisp(y,design)
y <- estimateGLMTrendedDisp(y,design)
y <- estimateGLMTagwiseDisp(y,design)
fit <- glmFit(y,design)
lrt <- glmLRT(fit) # Genewise Negative Binomial Generalized Linear Models.
glm.output <- topTags(lrt, n=Inf)
glm.output.full <- glm.output$table
glm.output.full$gene_name <- rownames(glm.output.full)
rownames(glm.output.full) <- NULL
glm.output.full <- glm.output.full[,c(6,1:5)]
glm.output.full
}
E12.5_GLM <- single_timepoint_glm_function(12.5)
E14.5_GLM <- single_timepoint_glm_function(14.5)
E17.5_GLM <- single_timepoint_glm_function(17.5)
E19.5_GLM <- single_timepoint_glm_function(19.5)
E12.5_up_n <- nrow(filter(E12.5_GLM, PValue < 0.05, logFC > 0))
E12.5_down_n <- nrow(filter(E12.5_GLM, PValue < 0.05, logFC < 0))
E14.5_up_n <- nrow(filter(E14.5_GLM, PValue < 0.05, logFC > 0))
E14.5_down_n <- nrow(filter(E14.5_GLM, PValue < 0.05, logFC < 0))
E17.5_up_n <- nrow(filter(E17.5_GLM, PValue < 0.05, logFC > 0))
E17.5_down_n <- nrow(filter(E17.5_GLM, PValue < 0.05, logFC < 0))
E19.5_up_n <- nrow(filter(E19.5_GLM, PValue < 0.05, logFC > 0))
E19.5_down_n <- nrow(filter(E19.5_GLM, PValue < 0.05, logFC < 0))
###
E12.5_up_n_FDR <- nrow(filter(E12.5_GLM, FDR < 0.05, logFC > 0))
E12.5_down_n_FDR <- nrow(filter(E12.5_GLM, FDR < 0.05, logFC < 0))
E14.5_up_n_FDR <- nrow(filter(E14.5_GLM, FDR < 0.05, logFC > 0))
E14.5_down_n_FDR <- nrow(filter(E14.5_GLM, FDR < 0.05, logFC < 0))
E17.5_up_n_FDR <- nrow(filter(E17.5_GLM, FDR < 0.05, logFC > 0))
E17.5_down_n_FDR <- nrow(filter(E17.5_GLM, FDR < 0.05, logFC < 0))
E19.5_up_n_FDR <- nrow(filter(E19.5_GLM, FDR < 0.05, logFC > 0))
E19.5_down_n_FDR <- nrow(filter(E19.5_GLM, FDR < 0.05, logFC < 0))
DE_df_n <- t(data.frame("E12.5" =c(E12.5_up_n, E12.5_down_n, E12.5_up_n_FDR, E12.5_down_n_FDR),
"E14.5" =c(E14.5_up_n, E14.5_down_n, E14.5_up_n_FDR, E14.5_down_n_FDR),
"E17.5" =c(E17.5_up_n, E17.5_down_n, E17.5_up_n_FDR, E17.5_down_n_FDR),
"P0" =c(E19.5_up_n, E19.5_down_n, E19.5_up_n_FDR, E19.5_down_n_FDR)))
colnames(DE_df_n) <- c("Upregulated", "Downregulated", "Upregulated", "Downregulated")
DE_df_n_melted <- melt(DE_df_n)
DE_df_n_melted$stat <- c(rep(", P < 0.05", 8), rep(", FDR < 0.05", 8))
DE_df_n_melted$Var2_stat <- paste(DE_df_n_melted$Var2, DE_df_n_melted$stat)
Fig1f <- ggplot(DE_df_n_melted, aes(fill=Var2_stat, group=Var2, x=Var1, y=value))+
geom_bar(position = "dodge", stat="identity", color="black")+
theme_bw()+
#scale_fill_manual(values=brewer.pal(n = 8, name = "Paired")[c(6,6,2,2 )])+
scale_fill_manual(values=c("#1F78B4", "#62a0ca", "#E31A1C", "#eb5e60"))+
theme(legend.title=element_blank())+
labs(title="Number of DE genes with P < 0.05 and FDR < 0.05", x="", y="")+
theme(plot.title = element_text(size = rel(1.2), hjust=0.5))+
geom_text(aes(label=value), position=position_dodge(width=0.9), hjust=-0.2)+
coord_flip()+
scale_x_discrete(limits = rev(levels(DE_df_n_melted$Var1)))+
theme(panel.border = element_blank(),
#legend.key = element_blank(),
axis.ticks = element_blank(),
axis.text = element_blank(),
axis.text.y = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())+
theme(legend.position="bottom")
Fig1f
volcano_plot_text_2 <- function(x, title) {
x <- dplyr::filter(x, PValue <0.05)
test <- ifelse(x$PValue, "Non significant")
test <- ifelse(x$logFC > 0 & x$PValue <0.05, "PValue < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$PValue <0.05, "PValue < 0.05 & logFC < 0", test)
test <- ifelse(x$logFC > 0 & x$FDR <0.05, "FDR < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$FDR <0.05, "FDR < 0.05 & logFC < 0", test)
plotDat <- data.frame(x, Group=test)
plotDat$logFC <- ifelse(plotDat$logFC > 4, 4, plotDat$logFC)
plotDat$logFC <- ifelse(plotDat$logFC < -4, -4, plotDat$logFC)
p <- ggplot(plotDat, aes(x = logFC, y=-log10(PValue), fill=Group, col = Group)) +
geom_point(aes(text=gene_name), size=0.5, pch=21, alpha=0.7, stroke = 0.5)+
theme_light()+
scale_fill_manual(values=c("Non significant"="grey30",
"PValue < 0.05 & logFC > 0"="#eb5e60",
"PValue < 0.05 & logFC < 0"="#62a0ca",
"FDR < 0.05 & logFC > 0" = "#960304",
"FDR < 0.05 & logFC < 0" = "#01538a"))+
scale_color_manual(values=c("Non significant"="grey30",
"PValue < 0.05 & logFC > 0"="#eb5e60",
"PValue < 0.05 & logFC < 0"="#62a0ca",
"FDR < 0.05 & logFC > 0" = "#960304",
"FDR < 0.05 & logFC < 0" = "#01538a"))+
labs(title= title, y="", x="")+
theme(plot.title = element_text(size = rel(2), hjust=0.5))+
theme(legend.text=element_text(size=16))+
theme(axis.text=element_text(size=14))+
theme(legend.title=element_blank())+
theme(axis.text=element_text(size=14))+
theme(axis.title = element_text(size=14, face = "bold"))+
theme(legend.position="bottom")+
coord_cartesian(xlim = c(-4.2, 4.2))+
geom_hline(yintercept = -log10(0.05), linetype=2)+
scale_x_continuous(breaks=c(-4, -2, 0, 2, 4), labels=c("<-4","-2", "0", "2", ">4"))+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
ggplotly(p) %>%
layout(legend = list(
orientation = "v",
y = -0.1,
font=list(
size=14
)
)
)
}
################# Version capped at different y axis levels ###############
volcano_plot_text_3 <- function(x, title) {
x <- dplyr::filter(x, PValue <0.05)
test <- ifelse(x$PValue, "Non significant")
test <- ifelse(x$logFC > 0 & x$PValue <0.05, "PValue < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$PValue <0.05, "PValue < 0.05 & logFC < 0", test)
test <- ifelse(x$logFC > 0 & x$FDR <0.05, "FDR < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$FDR <0.05, "FDR < 0.05 & logFC < 0", test)
plotDat <- data.frame(x, Group=test)
plotDat$logFC <- ifelse(plotDat$logFC > 4, 4, plotDat$logFC)
plotDat$logFC <- ifelse(plotDat$logFC < -4, -4, plotDat$logFC)
plotDat$log10_PValue <- -log10(plotDat$PValue)
plotDat$log10_PValue <- ifelse(plotDat$log10_PValue > 8, 8, plotDat$log10_PValue)
#scale_fill_manual(values=c("#1F78B4" - blue, "#62a0ca" - lighter blue, "#E31A1C" - red, "#eb5e60" - lighter red))+
p <- ggplot(plotDat, aes(x = logFC, y=log10_PValue, fill=Group, col = Group)) +
geom_point(aes(text=gene_name), size=0.5, pch=21, alpha=0.7, stroke = 0.5)+
theme_light()+
scale_fill_manual(values=c("Non significant"="grey30",
"PValue < 0.05 & logFC > 0"="#eb5e60",
"PValue < 0.05 & logFC < 0"="#62a0ca",
"FDR < 0.05 & logFC > 0" = "#960304",
"FDR < 0.05 & logFC < 0" = "#01538a"))+
scale_color_manual(values=c("Non significant"="grey30",
"PValue < 0.05 & logFC > 0"="#eb5e60",
"PValue < 0.05 & logFC < 0"="#62a0ca",
"FDR < 0.05 & logFC > 0" = "#960304",
"FDR < 0.05 & logFC < 0" = "#01538a"))+
labs(title= title, y="", x="")+
theme(plot.title = element_text(size = rel(2), hjust=0.5))+
theme(legend.text=element_text(size=16))+
theme(axis.text=element_text(size=14))+
theme(legend.title=element_blank())+
theme(axis.text=element_text(size=14))+
theme(axis.title = element_text(size=14, face = "bold"))+
theme(legend.position="bottom")+
coord_cartesian(xlim = c(-4.2, 4.2))+
expand_limits(x = c(-4.2, 4.2))+
coord_cartesian(ylim = c(0, 8))+
geom_hline(yintercept = -log10(0.05), linetype=2)+
scale_x_continuous(breaks=c(-4, -2, 0, 2, 4), labels=c("<-4","-2", "0", "2", ">4"))+
scale_y_continuous(breaks=c(0, 2, 4, 6, 8), labels=c("0","2", "4", "6", ">8"))+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
ggplotly(p) %>%
layout(legend = list(
orientation = "v",
y = -0.1,
font=list(
size=14
)
)
)
}
p <- volcano_plot_text_3(E12.5_GLM, "E12.5")
p
datatable(E12.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
p <- volcano_plot_text_3(E14.5_GLM, "E14.5")
p
datatable(E14.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
p <- volcano_plot_text_2(E17.5_GLM, "E17.5")
p
datatable(E17.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
p <- volcano_plot_text_3(E19.5_GLM, "P0")
p
datatable(E19.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
Figure 1d
#Build dataframe of logFC values
padding <- 0
a <- data.frame("gene_name"=E12.5_GLM$gene_name, "E12.5_logFC"=E12.5_GLM$logFC+padding)
b <- data.frame("gene_name"=E14.5_GLM$gene_name, "E14.5_logFC"=E14.5_GLM$logFC+padding)
c <- data.frame("gene_name"=E17.5_GLM$gene_name, "E17.5_logFC"=E17.5_GLM$logFC+padding)
d <- data.frame("gene_name"=E19.5_GLM$gene_name, "E19.5_logFC"=E19.5_GLM$logFC+padding)
e <- merge(a, b, by="gene_name")
f <- merge(c,d, by="gene_name")
df <- merge(e,f, by="gene_name")
FDR_threshold <- 0.05
logFC_threshold <- 1
a <- dplyr::filter(E12.5_GLM, FDR < FDR_threshold &
logFC > logFC_threshold | logFC < -1*logFC_threshold)$gene_name
b <- dplyr::filter(E14.5_GLM, FDR < FDR_threshold &
logFC > logFC_threshold | logFC < -1*logFC_threshold)$gene_name
c <- dplyr::filter(E17.5_GLM, FDR < FDR_threshold &
logFC > logFC_threshold | logFC < -1*logFC_threshold)$gene_name
d <- dplyr::filter(E19.5_GLM, FDR < FDR_threshold &
logFC > logFC_threshold | logFC < -1*logFC_threshold)$gene_name
DE_genes_logFC_1_20_top <- unique(c(a, b, c, d))
#length(DE_genes_logFC_1_20_top)
#Filter DE data frame
df_1 <- dplyr::filter(df, gene_name %in% DE_genes_logFC_1_20_top)
rownames(df_1) <- df_1$gene_name
df_1$gene_name <- NULL
colnames(df_1) <- c("E12.5", "E14.5", "E17.5", "P0")
rownames(df_1) <- NULL
df_1 <- df_1[,c(1:4)]
paletteLength <- 100
test <- as.matrix(df_1)
myBreaks <- c(seq(min(test), 0, length.out=ceiling(paletteLength/2) + 1),
seq(max(test)/paletteLength, max(test), length.out=floor(paletteLength/2)))
myColor <- colorRampPalette(c("#9E0041", "#fea11d","#fea11d", "white", "#43abea", "#0a48aa", "#13008e"))(paletteLength)
Fig_1d <- pheatmap(as.matrix(df_1),
clustering_distance_rows="correlation",
cluster_rows = T,
cluster_cols = F,
legend = T,
angle_col = 0,
legend_breaks = c(-4, -2, 0, 2, 4, max(df_1)),
legend_labels = c("-4", "-2", "0", "2", "4","log2FC"),
fontsize_row = gene_name_font_size,
color = rev(myColor),
breaks = myBreaks,
main = bquote(atop("Heatmap of DE genes",
"FDR < 0.05 and log" [2] *"FC >1 or <-1")))
Fig_1d
SFARI_genes <- read.csv("./GEO_submission files/SFARI-Gene_genes_01-15-2019release_02-26-2019export.csv")
#head(SFARI_genes)
#We are using only high confidence genes associated with autism
SFARI_genes_filtered <- filter(SFARI_genes, gene.score <3)
require("biomaRt")
#Function translating human to mouse orthologs
humanToMouse <- function(x){
human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
# comparison between mouse and human, returns mouse gene equivalents
genes = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = x , mart = human, attributesL = c("mgi_symbol"), martL = mouse, uniqueRows=T)
mousex <- unique(genes[, 2])
return(mousex)
}
#Returning a df matching orthologs
humanToMouse_2 <- function(x){
human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
# comparison between mouse and human, returns mouse gene equivalents
genes = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = x , mart = human, attributesL = c("mgi_symbol"), martL = mouse, uniqueRows=T)
mousex <- genes
return(mousex)
}
#Translating orthologs
SFARI_genes_mouse <- humanToMouse_2(SFARI_genes$gene.symbol)
#SFARI_genes_mouse
SFARI_genes_with_ortho <- merge(SFARI_genes, SFARI_genes_mouse, by.x= "gene.symbol", by.y = "HGNC.symbol")
#making sure still have all the 87 SFARI genes with gene.score 1 and 2
x <- filter(SFARI_genes_with_ortho, gene.score %in% c(1,2))[,c(1,9)]
SFARI_df_to_save <- filter(SFARI_genes_with_ortho, gene.score %in% c(1,2))[ ,c(1, 3:9)]
colnames(SFARI_df_to_save)[8] <- "MGI.symbol_ortholog"
#write.csv(SFARI_df_to_save, file="Supplementary Table 1, #High_confidence_SFARI_mouse_orthologs.csv",row.names = F)
#length(unique(x$gene.symbol)) # 87 Human SFARI genes
#length(unique(x$MGI.symbol)) # 89 mouse orthologs; MSNP1AS doesn't have an ortholog; DDX3X has 2 orthologs, SRCAP has 2 orthologs too.
#setdiff(SFARI_genes_filtered$gene.symbol, SFARI_genes_with_ortho$gene.symbol)
#### Circular enrichment plot and permutation test enrichment analysis####
`%notin%` <- Negate(`%in%`)
SFARI_DE_df_annotation <- function(x){
cat_1 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 1)$MGI.symbol)
cat_2 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 2)$MGI.symbol)
cat_3 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 3)$MGI.symbol)
cat_4 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 4)$MGI.symbol)
cat_5 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 5)$MGI.symbol)
cat_6 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 6)$MGI.symbol)
cat_NA <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == "NA")$MGI.symbol)
cat_Non_Ausitm <- filter(x, gene_name %notin% filter(SFARI_genes_with_ortho, gene.score %in% c(1, 2,3,4,5,6, "NA"))$MGI.symbol)
cat_1$SFARI_category <- rep(1, nrow(cat_1))
cat_2$SFARI_category <- rep(2, nrow(cat_2))
cat_3$SFARI_category <- rep(3, nrow(cat_3))
cat_4$SFARI_category <- rep(4, nrow(cat_4))
cat_5$SFARI_category <- rep(5, nrow(cat_5))
cat_6$SFARI_category <- rep(6, nrow(cat_6))
cat_NA$SFARI_category <- rep(NA, nrow(cat_NA))
cat_Non_Ausitm$SFARI_category <- rep("Non_Ausitm", nrow(cat_Non_Ausitm))
test <- rbind(cat_1, cat_2, cat_3, cat_4, cat_5, cat_6, cat_NA, cat_Non_Ausitm)
test$Up_or_Down <- ifelse(test$logFC > 0, "Upregulated", "Downregulated")
test
}
E12.5_GLM_SFARI_cat <- SFARI_DE_df_annotation(E12.5_GLM)
E14.5_GLM_SFARI_cat <- SFARI_DE_df_annotation(E14.5_GLM)
E17.5_GLM_SFARI_cat <- SFARI_DE_df_annotation(E17.5_GLM)
E19.5_GLM_SFARI_cat <- SFARI_DE_df_annotation(E19.5_GLM)
E12.5_GLM_SFARI_cat$DPC <- rep("E12.5", nrow(E12.5_GLM_SFARI_cat))
E14.5_GLM_SFARI_cat$DPC <- rep("E14.5", nrow(E14.5_GLM_SFARI_cat))
E17.5_GLM_SFARI_cat$DPC <- rep("E17.5", nrow(E17.5_GLM_SFARI_cat))
E19.5_GLM_SFARI_cat$DPC <- rep("E19.5", nrow(E19.5_GLM_SFARI_cat))
#Checking the number of SFARI genes with gene.score 1 or 2 at each timepoint
#filter(E12.5_GLM_SFARI_cat, SFARI_category %in% c(1,2)) #82 genes
#filter(E14.5_GLM_SFARI_cat, SFARI_category %in% c(1,2)) #82 genes
#filter(E17.5_GLM_SFARI_cat, SFARI_category %in% c(1,2)) #82 genes
#filter(E19.5_GLM_SFARI_cat, SFARI_category %in% c(1,2)) #82 genes
SFARI_genes_filtered_mouse <- x$MGI.symbol
#Double checking how many of the SFARI orthologs overlap with genes in DE analysis
#length(unique(SFARI_genes_filtered_mouse)) #There are 89 mouse orthologs of gene.score 1 and 2 SFARI genes
#filter(E12.5_GLM, gene_name %in% SFARI_genes_filtered_mouse) #82 genes
#filter(E14.5_GLM, gene_name %in% SFARI_genes_filtered_mouse) #82 genes
#filter(E17.5_GLM, gene_name %in% SFARI_genes_filtered_mouse) #82 genes
#filter(E19.5_GLM, gene_name %in% SFARI_genes_filtered_mouse) #82 genes
#The missing 7 genes are just not sufficiently expressed.
##### Stacked circular bar plot - the one in figure 2 #####
df <- rbind(E12.5_GLM_SFARI_cat, E14.5_GLM_SFARI_cat, E17.5_GLM_SFARI_cat, E19.5_GLM_SFARI_cat)
# Filtering SFARI genes with gene.score 1 and 2
df_test <- filter(df, SFARI_category %in% c(1,2))
#I'm replacing the real values of logFC to 1 and -1 to build a stacked bar plot
df_test$logFC <- ifelse(df_test$logFC > 0, 1, -1)
#Annotates if a gene is significant
df_test$Sig <- as.character(ifelse(df_test$PValue < 0.05, "DE", "Non-DE"))
#A combination fo DPC and significance
df_test$DPC_Sig <- paste(df_test$DPC, df_test$Sig, sep="_")
my_col <- brewer.pal(n = 8, name = "Set1")
#head(df_test)
#Add Up or down
df_test$dir <- ifelse(df_test$logFC > 0, "Up", "Down") # Redundant
df_test$DPC_Sig_dir <- paste(df_test$DPC, df_test$Sig, df_test$dir, sep = "_")
df_test <- arrange(df_test, gene_name)
#Yes, I checked the gene names are the same in the columns
#all(filter(df_test, DPC == "E12.5")$gene_name == filter(df_test, DPC == "E14.5")$gene_name)
#all(filter(df_test, DPC == "E17.5")$gene_name == filter(df_test, DPC == "E19.5")$gene_name)
#all(filter(df_test, DPC == "E12.5")$gene_name == filter(df_test, DPC == "E17.5")$gene_name)
## A better matrix for clustering with 0 of non-DE logFC values
df_test$logFC_sig <- ifelse(df_test$PValue > 0.05, 0, df_test$logFC)
df_matrix <- data.frame("gene_name" = filter(df_test, DPC == "E12.5")$gene_name,
"E12.5" = filter(df_test, DPC == "E12.5")$logFC_sig,
"E14.5" = filter(df_test, DPC == "E14.5")$logFC_sig,
"E17.5" = filter(df_test, DPC == "E17.5")$logFC_sig,
"E19.5" = filter(df_test, DPC == "E19.5")$logFC_sig)
#head(df_matrix)
rownames(df_matrix) <- df_matrix$gene_name
df_matrix$gene_name <- NULL
# Dissimilarity matrix
d <- dist(df_matrix, method = "euclidean")
# Hierarchical clustering using Complete Linkage
hc1 <- hclust(d, method = "ward.D2" )
# Plot dendrogram
#plot(hc1, cex = 0.6, hang = -1)
#head(df_test)
df_test$gene_name <- factor(df_test$gene_name, levels = unique(df_test$gene_name)[hc1$order])
df_test <- arrange(df_test, gene_name)
### Angle of labels
step <- 360/length(unique(df_test$gene_name))
myAng <- seq(0, 360, step)
myAng <- rev(myAng)
myAng <- myAng - 90
myAng <- ifelse(myAng > 90 & myAng < 270, myAng-180, myAng)
#Adding FDR
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == -1 & df_test$DPC == "E12.5", "E12.5_Down_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == -1 & df_test$DPC == "E14.5", "E14.5_Down_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == -1 & df_test$DPC == "E17.5", "E17.5_Down_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == -1 & df_test$DPC == "E19.5", "E19.5_Down_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == 1 & df_test$DPC == "E12.5", "E12.5_Up_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == 1 & df_test$DPC == "E14.5", "E14.5_Up_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == 1 & df_test$DPC == "E17.5", "E17.5_Up_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == 1 & df_test$DPC == "E19.5", "E19.5_Up_FDR", df_test$DPC_Sig_dir)
df_test$Sig_dir <- ifelse(df_test$PValue < 0.05 & df_test$logFC == -1, "P_Down", "Non-Sig")
df_test$Sig_dir <- ifelse(df_test$PValue < 0.05 & df_test$logFC == 1, "P_Up", df_test$Sig_dir)
df_test$Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == -1, "P_Down", df_test$Sig_dir)
df_test$Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == 1, "P_Up", df_test$Sig_dir)
#####
Fig1g <- ggplot(df_test, aes(x=gene_name, y = abs(logFC), fill=DPC_Sig_dir))+
geom_bar(stat = "identity", alpha = 1, position = "stack")+
annotate("rect", xmin=0, xmax=Inf, ymin=-1, ymax=0, alpha=1, fill="white")+
annotate("rect", xmin=0, xmax=Inf, ymin=0, ymax=1, alpha=0.5, fill="grey10")+
annotate("rect", xmin=0, xmax=Inf, ymin=1, ymax=2, alpha=0.5, fill="grey30")+
annotate("rect", xmin=0, xmax=Inf, ymin=2, ymax=3, alpha=0.5, fill="grey60")+
annotate("rect", xmin=0, xmax=Inf, ymin=3, ymax=4, alpha=0.5, fill="grey90")+
geom_bar(stat = "identity", alpha = 1, position = "stack")+
coord_polar()+
theme_minimal()+
geom_hline(yintercept = 0, size=1, linetype=1)+
geom_hline(yintercept = 1, size=1, linetype=1)+
geom_hline(yintercept = 2, size=1, linetype=1)+
geom_hline(yintercept = 3, size=1, linetype=1)+
geom_hline(yintercept = 4, size=1, linetype=1)+
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())+
labs(x="", y="")+
theme(legend.position = "none")+
scale_fill_manual(values = c(
"E12.5_DE_Up"= "#eb8889",
"E14.5_DE_Up" = "#eb8889",
"E17.5_DE_Up" = "#eb8889",
"E19.5_DE_Up" = "#eb8889",
"E12.5_Non-DE_Up"= NA,
"E14.5_Non-DE_Up" = NA,
"E17.5_Non-DE_Up" = NA,
"E19.5_Non-DE_Up" = NA,
"E12.5_DE_Down"= "#83aecc",
"E14.5_DE_Down" = "#83aecc",
"E17.5_DE_Down" = "#83aecc",
"E19.5_DE_Down" = "#83aecc",
"E12.5_Non-DE_Down"= NA,
"E14.5_Non-DE_Down" = NA,
"E17.5_Non-DE_Down" = NA,
"E19.5_Non-DE_Down" = NA,
############################
"E12.5_Down_FDR" = my_col[2],
"E14.5_Down_FDR" = my_col[2],
"E17.5_Down_FDR" = my_col[2],
"E19.5_Down_FDR" = my_col[2],
"E12.5_Up_FDR" = my_col[1],
"E14.5_Up_FDR" = my_col[1],
"E17.5_Up_FDR" = my_col[1],
"E19.5_Up_FDR" = my_col[1]
))+
ylim(-1,4)+
theme(axis.text.x = element_text(angle = myAng, size =13, color="black"))+
theme(panel.border = element_blank(),
legend.key = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank())+
annotate(geom = "text", x = 72, y = 0.5, label = "P0", color = "black",
angle = 50, size =10)+
annotate(geom = "text", x = 72, y = 1.46, label = "E17.5", color = "black",
angle = 50, size =10)+
annotate(geom = "text", x = 72, y = 2.47, label = "E14.5", color = "black",
angle = 50, size =10)+
annotate(geom = "text", x = 72, y = 3.47, label = "E12.5", color = "black",
angle = 50, size =10)
Fig1g
##### Permutation test per GOmpers et al. #####
# Code adopted from Alex's analysis
#All genes in the background
#Enrichment fo SFARI genes among DE genes.
permutation_test <- function(x, plot.name){
# x <- E17.5_GLM_module_SFARI_cat
binary.score.reference <- ifelse(x$gene_name %in% filter(x, SFARI_category %in% c(1,2))$gene_name, 1,0)
binary.score.test <- ifelse(filter(x, PValue < 0.05)$gene_name %in% filter(x, PValue <0.05, SFARI_category %in% c(1,2))$gene_name, 1,0)
geneset.perm.test <- function(binary.score.reference, binary.score.test, iterations, plot.name) {
binary.score.reference <- ifelse(is.na(binary.score.reference),0,binary.score.reference)
binary.score.test <- ifelse(is.na(binary.score.test),0,binary.score.test)
count.criteria <- vector(length=iterations)
test.size <- length(binary.score.test)
for (index in 1:iterations) {
count.criteria[index] <- sum(sample(binary.score.reference, test.size, replace = F))
}
x.min <- min(c(count.criteria, sum(binary.score.test))) - 20
if (x.min < 0) {x.min <- 0}
x.max<- max(c(count.criteria, sum(binary.score.test))) + 20
z <- abs(mean(count.criteria) - sum(binary.score.test))/sd(count.criteria)
p <- 2*pnorm(-abs(z))
e <- sum(binary.score.test)/mean(count.criteria)
prop <- sum(binary.score.test)/length(binary.score.test)
bg <- length(binary.score.reference)
hist(count.criteria, col="gray", xlab="Count", ylab="Frequency",
main=paste(plot.name, " \ncount=", sum(binary.score.test), " of ", length(binary.score.test), "; p-value=", format(p, scientific = T, digits = 2), sep=""),
xlim=c(x.min,x.max), cex=.5)
# Original title with more parameters
#main=paste(plot.name, " \ncount=", sum(binary.score.test), " of ", #length(binary.score.test), "; p-value=", format(p, scientific = T, digits = #2), "; FE=", format(e, scientific = F, digits = 2), "; Prop=", format(prop, #scientific = F, digits = 2), "; n(bg)=", bg, sep=""),
abline(v=sum(binary.score.test), lwd=3, col="red")
#return(list(count.criteria,z,p,e,prop,bg,sum(binary.score.test)))
}
geneset.perm.test(binary.score.reference, binary.score.test, 100000, plot.name)
}
par(mfrow=c(2,2), mai=c(0.9,0.9,0.9,0.9))
permutation_test(E12.5_GLM_SFARI_cat, "E12.5")
permutation_test(E14.5_GLM_SFARI_cat, "E14.5")
permutation_test(E17.5_GLM_SFARI_cat, "E17.5")
permutation_test(E19.5_GLM_SFARI_cat, "P0")
#Hypergeometric test
SFARI_genes_hypergeometric_test <- function(x){
#Anotations on the right refer to the post in the link
#https://stats.stackexchange.com/questions/16247/calculating-the-probability-of-gene-list-overlap-between-an-rna-seq-and-a-chip-c
#x <- E17.5_GLM_module_SFARI_cat
n <- length(as.character(x$gene_name)) #Total gene population 15k
a <- length(filter(x, PValue < 0.05)$gene_name) #RNA Seq enriched # 3000
b <- length(filter(x, SFARI_category %in% c(1,2))$gene_name) #Enriched by Chip-Seq 400
t <- length(filter(x, PValue < 0.05, SFARI_category %in% c(1,2))$gene_name) #Intersected 100
hypergeometric_test_p_value <- sum(dhyper(t:b, a, n - a, b))
hypergeometric_test_p_value
}
hyper_E12.5 <- SFARI_genes_hypergeometric_test(E12.5_GLM_SFARI_cat)
hyper_E14.5 <- SFARI_genes_hypergeometric_test(E14.5_GLM_SFARI_cat)
hyper_E17.5 <- SFARI_genes_hypergeometric_test(E17.5_GLM_SFARI_cat)
hyper_P0 <- SFARI_genes_hypergeometric_test(E19.5_GLM_SFARI_cat)
df_p_values <- data.frame("Age" = c("E12.5", "E14.5", "E17.5", "P0"),
"Permutation test P" = c("0.54", "0.87", "7.6e-06", "0.23"),
"Hypergeometric test P" = c(
format(hyper_E12.5, scientific = F, digits = 2),
format(hyper_E14.5, scientific = F, digits = 2),
format(hyper_E17.5, scientific = T, digits = 2),
format(hyper_P0, scientific = F, digits = 2)
))
colnames(df_p_values) <- c("Age", "Permutation test P", "Hypergeometric test P")
#df_p_values
knitr::kable(df_p_values,
format = "html", table.attr = "style='width:40%;'", align = "l", position = "left") %>%
kableExtra::kable_styling(position = "left")
| Age | Permutation test P | Hypergeometric test P |
|---|---|---|
| E12.5 | 0.54 | 0.32 |
| E14.5 | 0.87 | 0.6 |
| E17.5 | 7.6e-06 | 7.3e-06 |
| P0 | 0.23 | 0.93 |
###
# x is a gene name
# y is a plot title, in case there is an alternative gene name
rpkm_box_plot <- function(x, y){
rpkm_test <- as.data.frame(reshape2::melt(rpkm.data_linear_all[x,]))
rpkm_test_w_info <- cbind(rpkm_test, sample.info[,"ExperimentalDesign"])
rpkm_test_w_info <- cbind(rpkm_test_w_info, as.character(sample.info[,"DPC"]))
rpkm_test_w_info <- cbind(rpkm_test_w_info, as.character(sample.info[,"Condition"]))
colnames(rpkm_test_w_info) <- c("RPKM", "treatment", "DPC", "Condition")
j_brew_colors <- brewer.pal(n = 8, name = "Paired")[c(6,2)]
ggplot(rpkm_test_w_info, aes(x = DPC, y= RPKM, colour=Condition))+
geom_smooth(formula = y ~ x, method = "loess", se=T, aes(fill=Condition, group=Condition), size = 0.7, alpha=0)+
geom_boxplot(alpha=0, position="identity", size = 0.2)+
geom_jitter(size=2, pch = 19, width = 0.2, alpha = 0.5)+
theme_bw()+
theme(axis.text.x=element_text(angle=50, vjust=0.9, hjust=1, size=14))+
theme(axis.text.y=element_text(size=12))+
theme(axis.title.y=element_text(size=14))+
labs(title= y, x="")+
theme(plot.title = element_text(size = rel(2), hjust=0.5))+
scale_color_manual(values = j_brew_colors)+
scale_x_discrete(breaks =c("12.5", "14.5", "17.5", "19.5"), labels = c("E12.5", "E14.5", "E17.5", "P0"))+
theme(legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())+
theme(legend.position = "none")
}
genes <- c("Vegfa", "Flt1", "Pdk1", "Ldha", "Bnip3", "Il20rb")
pl <- lapply(genes, function(x) rpkm_box_plot(x,x))
marrangeGrob(pl, nrow=2, ncol=3, top = "",
layout_matrix = rbind(c(1,2,3), c(4,5,6)))
genes <- c("Mki67", "Eomes", "Pax6", "Sox9", "Tbr1", "Bcl11b", "Satb2", "Cux1")
gene_names <- c("Mki67 (Ki67)", "Eomes (Tbr2)", "Pax6", "Sox9", "Tbr1", "Bcl11b (Ctip2)", "Satb2", "Cux1")
df_genes <- data.frame(genes, gene_names)
pl <- lapply(1:nrow(df_genes), function(x) rpkm_box_plot(df_genes[x, 1], df_genes[x, 2]))
marrangeGrob(pl, nrow=4, ncol=2, top = "",
layout_matrix = rbind(c(1,2), c(3,4), c(5,6), c(7,8)))
genes <- c("Gfap", "Olig2", "Dlx2", "Gad1")
pl <- lapply(genes, function(x) rpkm_box_plot(x,x))
marrangeGrob(pl, nrow=2, ncol=3, top = "",
layout_matrix = rbind(c(1,2), c(3,4)))
genes <- c("Mki67", "Eomes", "Pax6", "Sox9")
gene_names <- c("Mki67 (Ki67)", "Eomes (Tbr2)", "Pax6", "Sox9")
df_genes <- data.frame(genes, gene_names)
pl <- lapply(1:nrow(df_genes), function(x) rpkm_box_plot(df_genes[x, 1], df_genes[x, 2]))
marrangeGrob(pl, nrow=2, ncol=2, top = "",
layout_matrix = rbind(c(1,2), c(3,4)))